{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import CatBoostRegressor\n",
    "from sklearn.metrics import mean_squared_error\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold\n",
    "import datetime\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# first data of groupby fill weather missing merged\n",
    "df_train= pd.read_csv('../../Large_output/train_merge.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.api.types import is_datetime64_any_dtype as is_datetime\n",
    "from pandas.api.types import is_categorical_dtype\n",
    "\n",
    "def reduce_mem_usage(df, use_float16=False):\n",
    "    \"\"\"\n",
    "    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    \n",
    "    start_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage of dataframe is {:.2f} MB\".format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        if is_datetime(df[col]) or is_categorical_dtype(df[col]):\n",
    "            continue\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == \"int\":\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype(\"category\")\n",
    "\n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage after optimization is: {:.2f} MB\".format(end_mem))\n",
    "    print(\"Decreased by {:.1f}%\".format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def features_engineering(df):\n",
    "    \n",
    "    # Sort by localtime\n",
    "    df.sort_values(\"local_time\")\n",
    "    df.reset_index(drop=True)\n",
    "    \n",
    "    # Add more features\n",
    "    df[\"local_time\"] = pd.to_datetime(df[\"local_time\"],format=\"%Y-%m-%d %H:%M:%S\")\n",
    "    df[\"hour\"] = df[\"local_time\"].dt.hour\n",
    "    df[\"weekend\"] = df[\"local_time\"].dt.weekday\n",
    "    df['square_feet'] =  np.log1p(df['square_feet'])\n",
    "    \n",
    "    \n",
    "    # Encode Categorical Data\n",
    "    le = LabelEncoder()\n",
    "    df[\"primary_use\"] = le.fit_transform(df[\"primary_use\"])\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 2271.93 MB\n",
      "Memory usage after optimization is: 568.74 MB\n",
      "Decreased by 75.0%\n"
     ]
    }
   ],
   "source": [
    "df_train = reduce_mem_usage(df_train,use_float16=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_engineer = features_engineering(df_train)\n",
    "train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\\\n",
    "=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)\n",
    "target = np.log1p(train_engineer[\"meter_reading\"])\n",
    "features = train_engineer[['building_id', 'meter','site_id','primary_use', \n",
    "                          'square_feet','air_temperature','cloud_coverage',\n",
    "                          'dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]\n",
    "del df_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_features = [\"building_id\", \"site_id\", \"meter\", \n",
    "                        \"primary_use\",  \"weekend\",'is_holiday']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {'n_estimators': 2000,\n",
    "          'learning_rate':0.05,\n",
    "          'depth':12,\n",
    "          'eval_metric': 'RMSE',\n",
    "          'loss_function': 'RMSE',\n",
    "          'early_stopping_rounds' : 50,\n",
    "          'random_state':42,\n",
    "          'metric_period': 100,}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0:\tlearn: 2.0197523\ttest: 2.0568292\tbest: 2.0568292 (0)\ttotal: 7.46s\tremaining: 4h 8m 39s\n",
      "100:\tlearn: 1.0631220\ttest: 1.2024373\tbest: 1.2024373 (100)\ttotal: 10m 58s\tremaining: 3h 26m 16s\n",
      "200:\tlearn: 0.9971614\ttest: 1.1719861\tbest: 1.1719861 (200)\ttotal: 22m 52s\tremaining: 3h 24m 42s\n",
      "300:\tlearn: 0.9585480\ttest: 1.1571542\tbest: 1.1571542 (300)\ttotal: 34m 30s\tremaining: 3h 14m 46s\n",
      "400:\tlearn: 0.9306938\ttest: 1.1495907\tbest: 1.1495907 (400)\ttotal: 46m 4s\tremaining: 3h 3m 45s\n",
      "500:\tlearn: 0.9091359\ttest: 1.1437987\tbest: 1.1436488 (498)\ttotal: 57m 56s\tremaining: 2h 53m 21s\n",
      "600:\tlearn: 0.8928639\ttest: 1.1409205\tbest: 1.1409205 (600)\ttotal: 1h 9m 48s\tremaining: 2h 42m 30s\n",
      "700:\tlearn: 0.8811650\ttest: 1.1384600\tbest: 1.1383988 (691)\ttotal: 1h 21m 33s\tremaining: 2h 31m 8s\n",
      "800:\tlearn: 0.8700997\ttest: 1.1364594\tbest: 1.1364406 (799)\ttotal: 1h 33m 38s\tremaining: 2h 20m 9s\n",
      "900:\tlearn: 0.8603185\ttest: 1.1353025\tbest: 1.1353025 (900)\ttotal: 1h 45m 40s\tremaining: 2h 8m 53s\n",
      "1000:\tlearn: 0.8518508\ttest: 1.1350071\tbest: 1.1349768 (999)\ttotal: 1h 58m 1s\tremaining: 1h 57m 47s\n",
      "Stopped by overfitting detector  (50 iterations wait)\n",
      "\n",
      "bestTest = 1.134523427\n",
      "bestIteration = 1047\n",
      "\n",
      "Shrink model to first 1048 iterations.\n",
      "Fold 1 | rmse: 1.134523427413442\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0:\tlearn: 2.0422157\ttest: 2.0181382\tbest: 2.0181382 (0)\ttotal: 6.86s\tremaining: 3h 48m 28s\n",
      "100:\tlearn: 1.0744251\ttest: 1.2112417\tbest: 1.2112417 (100)\ttotal: 11m\tremaining: 3h 27m 4s\n",
      "200:\tlearn: 1.0138663\ttest: 1.1410900\tbest: 1.1410900 (200)\ttotal: 22m 54s\tremaining: 3h 24m 57s\n",
      "300:\tlearn: 0.9761728\ttest: 1.1121296\tbest: 1.1121296 (300)\ttotal: 34m 40s\tremaining: 3h 15m 43s\n",
      "400:\tlearn: 0.9511017\ttest: 1.0948035\tbest: 1.0948035 (400)\ttotal: 46m 43s\tremaining: 3h 6m 20s\n",
      "500:\tlearn: 0.9329604\ttest: 1.0856865\tbest: 1.0856865 (500)\ttotal: 58m 45s\tremaining: 2h 55m 47s\n",
      "600:\tlearn: 0.9184059\ttest: 1.0791304\tbest: 1.0791304 (600)\ttotal: 1h 10m 52s\tremaining: 2h 44m 59s\n",
      "700:\tlearn: 0.9073367\ttest: 1.0737496\tbest: 1.0737496 (700)\ttotal: 1h 23m\tremaining: 2h 33m 48s\n",
      "800:\tlearn: 0.8976513\ttest: 1.0693314\tbest: 1.0693314 (800)\ttotal: 1h 35m 5s\tremaining: 2h 22m 21s\n",
      "900:\tlearn: 0.8891734\ttest: 1.0677035\tbest: 1.0672219 (884)\ttotal: 1h 47m 31s\tremaining: 2h 11m 9s\n",
      "1000:\tlearn: 0.8817576\ttest: 1.0650280\tbest: 1.0649716 (998)\ttotal: 1h 59m 39s\tremaining: 1h 59m 25s\n",
      "Stopped by overfitting detector  (50 iterations wait)\n",
      "\n",
      "bestTest = 1.064889366\n",
      "bestIteration = 1012\n",
      "\n",
      "Shrink model to first 1013 iterations.\n",
      "Fold 2 | rmse: 1.0648893660405658\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0:\tlearn: 2.0336521\ttest: 2.0309568\tbest: 2.0309568 (0)\ttotal: 6.67s\tremaining: 3h 42m 18s\n",
      "100:\tlearn: 1.0377871\ttest: 1.2119426\tbest: 1.2119426 (100)\ttotal: 10m 56s\tremaining: 3h 25m 40s\n",
      "200:\tlearn: 0.9587146\ttest: 1.1904469\tbest: 1.1904081 (199)\ttotal: 22m 45s\tremaining: 3h 23m 44s\n",
      "300:\tlearn: 0.9146566\ttest: 1.1796038\tbest: 1.1796038 (300)\ttotal: 34m 32s\tremaining: 3h 15m\n",
      "400:\tlearn: 0.8855939\ttest: 1.1717177\tbest: 1.1717177 (400)\ttotal: 46m 22s\tremaining: 3h 4m 54s\n",
      "500:\tlearn: 0.8648077\ttest: 1.1684259\tbest: 1.1684132 (496)\ttotal: 58m 14s\tremaining: 2h 54m 14s\n",
      "600:\tlearn: 0.8491754\ttest: 1.1659104\tbest: 1.1657589 (598)\ttotal: 1h 9m 49s\tremaining: 2h 42m 31s\n",
      "700:\tlearn: 0.8342476\ttest: 1.1642697\tbest: 1.1641529 (699)\ttotal: 1h 21m 43s\tremaining: 2h 31m 27s\n",
      "800:\tlearn: 0.8231683\ttest: 1.1638116\tbest: 1.1634155 (765)\ttotal: 1h 33m 41s\tremaining: 2h 20m 14s\n",
      "Stopped by overfitting detector  (50 iterations wait)\n",
      "\n",
      "bestTest = 1.163415455\n",
      "bestIteration = 765\n",
      "\n",
      "Shrink model to first 766 iterations.\n",
      "Fold 3 | rmse: 1.163415455158452\n",
      "\n",
      "Mean rmse = 1.1209427495374866\n",
      "Out of folds rmse = 1.121705292632275\n"
     ]
    }
   ],
   "source": [
    "NFOLDS = 3\n",
    "columns = features.columns\n",
    "kf = KFold(n_splits=3)\n",
    "splits = kf.split(features, target)\n",
    "y_oof = np.zeros(features.shape[0])\n",
    "score = 0\n",
    "out_folder_train_prediction= pd.DataFrame()\n",
    "feature_importance_df = pd.DataFrame()\n",
    "\n",
    "models = []\n",
    "\n",
    "for fold_n, (train_index, valid_index) in enumerate(splits):\n",
    "    X_tr=features.iloc[train_index]\n",
    "    y_tr=target.iloc[train_index]\n",
    "    X_val=features.iloc[valid_index]\n",
    "    y_val=target.iloc[valid_index]\n",
    "    \n",
    "    model = CatBoostRegressor(**params)\n",
    "        \n",
    "    model.fit(X_tr, y_tr, eval_set=(X_val, y_val), cat_features=categorical_features, verbose=True)\n",
    "\n",
    "    y_pred_valid = model.predict(X_val)\n",
    "    y_oof[valid_index] = y_pred_valid\n",
    "    print(f\"Fold {fold_n + 1} | rmse: {np.sqrt(mean_squared_error(y_val, y_pred_valid))}\")\n",
    "    \n",
    "    score += np.sqrt(mean_squared_error(y_val, y_pred_valid)) / NFOLDS\n",
    "    \n",
    "    oof_preds=pd.DataFrame()\n",
    "    oof_preds['train_index']=valid_index\n",
    "    oof_preds['TARGET']= y_pred_valid\n",
    "    oof_preds[\"folder\"]=fold_n + 1\n",
    "    out_folder_train_prediction = pd.concat([out_folder_train_prediction, oof_preds], axis=0)\n",
    "    \n",
    "    \n",
    "    fold_importance_df = pd.DataFrame()\n",
    "    fold_importance_df['feature']=columns\n",
    "    fold_importance_df['importance']=model.get_feature_importance()\n",
    "    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
    "    \n",
    "    models.append(model)\n",
    "    del X_val,X_tr,y_val,y_tr\n",
    "    gc.collect()\n",
    "    \n",
    "print(f\"\\nMean rmse = {score}\")\n",
    "print(f\"Out of folds rmse = {np.sqrt(mean_squared_error(target, y_oof))}\")\n",
    "out_folder_train_prediction.to_csv('out_folder_train_prediction_cat.csv',index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 4771.91 MB\n",
      "Memory usage after optimization is: 1671.69 MB\n",
      "Decreased by 65.0%\n"
     ]
    }
   ],
   "source": [
    "test_feature = pd.read_csv('../../Large_output/test_merge.csv')\n",
    "test_feature = reduce_mem_usage(test_feature)\n",
    "test_feature = features_engineering(test_feature)\n",
    "test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\\\n",
    "                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday', 'row_id']]\n",
    "row_ids = test_feature[['row_id']]\n",
    "test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\\\n",
    "                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:3: DeprecationWarning: elementwise comparison failed; this will raise an error in the future.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "results = []\n",
    "for model in models:\n",
    "    if  results == []:\n",
    "        results = np.expm1(model.predict(test_feature)) / len(models)\n",
    "    else:\n",
    "        results += np.expm1(model.predict(test_feature)) / len(models)\n",
    "    del model\n",
    "    gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_feature['meter_reading']=np.clip(results, 0, a_max=None)\n",
    "test_feature.loc[(test_feature['site_id']==0) & \n",
    "                 (test_feature['meter']==0),'meter_reading']=test_feature.loc[(test_feature['site_id']==0) &\n",
    "                                                            (test_feature['meter']==0),'meter_reading'].mul(3.4118)\n",
    "df_result = pd.DataFrame({'row_id': row_ids['row_id'], 'meter_reading': test_feature['meter_reading']})\n",
    "df_result.to_csv('../../Large_output/cat_init_kf3.csv',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
